# Working Directory
setwd("~/Desktop/PolicingTStops/BadApples")

# Libraries
library(readr)
library(dplyr)

##
## Cleaning the Raw Data
##

# Raw Data
ct14 = read_csv("Data/CT/connecticut-r1.csv")
ct14$ReportingOfficerIdentificationID = as.character(ct14$ReportingOfficerIdentificationID)
ct14$InterventionIdentificationID = as.character(ct14$InterventionIdentificationID)
ct14$SourceReferenceId = as.character(ct14$SourceReferenceId)
ct14$InterventionDate = as.character(ct14$InterventionDate)
ct14$InterventionDateTime = as.character(ct14$InterventionDateTime)
ct14$InterventionLocationLatitude = as.character(ct14$InterventionLocationLatitude)
ct14$InterventionLocationLongitude = as.character(ct14$InterventionLocationLongitude)

ct15 = read_csv("Data/CT/connecticut-r2.csv")
ct15$ReportingOfficerIdentificationID = as.character(ct15$ReportingOfficerIdentificationID)
ct15$InterventionIdentificationID = as.character(ct15$InterventionIdentificationID)
ct15$SourceReferenceId = as.character(ct15$SourceReferenceId)
ct15$InterventionDate = as.character(ct15$InterventionDate)
ct15$InterventionDateTime = as.character(ct15$InterventionDateTime)
ct15$InterventionLocationLatitude = as.character(ct15$InterventionLocationLatitude)
ct15$InterventionLocationLongitude = as.character(ct15$InterventionLocationLongitude)

ct16 = read_csv("Data/CT/connecticut-r3.csv")
ct16$ReportingOfficerIdentificationID = as.character(ct16$ReportingOfficerIdentificationID)
ct16$InterventionIdentificationID = as.character(ct16$InterventionIdentificationID)
ct16$SourceReferenceId = as.character(ct16$SourceReferenceId)
ct16$InterventionDate = as.character(ct16$InterventionDate)
ct16$InterventionDateTime = as.character(ct16$InterventionDateTime)
ct16$InterventionLocationLatitude = as.character(ct16$InterventionLocationLatitude)
ct16$InterventionLocationLongitude = as.character(ct16$InterventionLocationLongitude)

ct17 = read_csv("Data/CT/connecticut-r4.csv")
ct17$ReportingOfficerIdentificationID = as.character(ct17$ReportingOfficerIdentificationID)
ct17$InterventionIdentificationID = as.character(ct17$InterventionIdentificationID)
ct17$SourceReferenceId = as.character(ct17$SourceReferenceId)
ct17$InterventionDate = as.character(ct17$InterventionDate)
ct17$InterventionDateTime = as.character(ct17$InterventionDateTime)
ct17$InterventionLocationLatitude = as.character(ct17$InterventionLocationLatitude)
ct17$InterventionLocationLongitude = as.character(ct17$InterventionLocationLongitude)

ct18 = read_csv("Data/CT/connecticut-r5.csv")
ct18$ReportingOfficerIdentificationID = as.character(ct18$ReportingOfficerIdentificationID)
ct18$InterventionIdentificationID = as.character(ct18$InterventionIdentificationID)
ct18$SourceReferenceId = as.character(ct18$SourceReferenceId)
ct18$InterventionDate = as.character(ct18$InterventionDate)
ct18$InterventionDateTime = as.character(ct18$InterventionDateTime)
ct18$InterventionLocationLatitude = as.character(ct18$InterventionLocationLatitude)
ct18$InterventionLocationLongitude = as.character(ct18$InterventionLocationLongitude)

ct19 = read_csv("Data/CT/connecticut-r6.csv")
ct19$ReportingOfficerIdentificationID = as.character(ct19$ReportingOfficerIdentificationID)
ct19$InterventionIdentificationID = as.character(ct19$InterventionIdentificationID)
ct19$SourceReferenceId = as.character(ct19$SourceReferenceId)
ct19$InterventionDate = as.character(ct19$InterventionDate)
ct19$InterventionDateTime = as.character(ct19$InterventionDateTime)
ct19$InterventionLocationLatitude = as.character(ct19$InterventionLocationLatitude)
ct19$InterventionLocationLongitude = as.character(ct19$InterventionLocationLongitude)

# Binding them together.
ct = bind_rows(ct14,ct15,ct16,
               ct17,ct18,ct19)

# Year
ct$year = ifelse(grepl("2013",ct$InterventionDate),2013,
                 ifelse(grepl("2014",ct$InterventionDate),2014,
                        ifelse(grepl("2015",ct$InterventionDate),2015,
                               ifelse(grepl("2016",ct$InterventionDate),2016,
                                      ifelse(grepl("2017",ct$InterventionDate),2017,
                                             ifelse(grepl("2018",ct$InterventionDate),2018,
                                                    ifelse(grepl("2019",ct$InterventionDate),2019,
                                                           9999)))))))

table(as.character(ct$year))

ct = subset(ct,as.numeric(as.character(ct$year))>2013)

ct$race.ethnicity = ifelse(ct$SubjectRaceCode=="B"&
                             ct$SubjectEthnicityCode=="N","Black",
                           ifelse(ct$SubjectRaceCode=="W"&
                                    ct$SubjectEthnicityCode=="N","White",
                                  ifelse(ct$SubjectEthnicityCode=="H","Latinx",NA)))

ct$stopoccur = 1
ct$searchoccur = ifelse(ct$SearchAuthorizationCode=="N",0,1)
ct$searchoccur = ifelse(ct$ContrabandIndicator==1,1,ct$searchoccur)
ct$contraband = ifelse(is.na(ct$ContrabandIndicator),0,
                       ifelse(ct$ContrabandIndicator==1&
                                ct$searchoccur==1,1,0))

ct$stop.white = ifelse(ct$race.ethnicity=="White",1,0)
ct$search.white = ifelse(ct$race.ethnicity=="White"&ct$searchoccur==1,1,0)
ct$contra.white = ifelse(ct$race.ethnicity=="White"&ct$contraband==1,1,0)
ct$stop.black = ifelse(ct$race.ethnicity=="Black",1,0)
ct$search.black = ifelse(ct$race.ethnicity=="Black"&ct$searchoccur==1,1,0)
ct$contra.black = ifelse(ct$race.ethnicity=="Black"&ct$contraband==1,1,0)

ct$unique.officer = paste(ct$`Department Name`,
                          ct$ReportingOfficerIdentificationID,sep="::")

table(is.na(ct$ReportingOfficerIdentificationID))

ct = subset(ct,!is.na(ct$ReportingOfficerIdentificationID))

save(ct,file="Data/CT_FullFile.RData")

# Collapsing by Officer
ct.officer.temp = aggregate(ct[,c("stop.white","search.white","contra.white",
                                  "stop.black","search.black","contra.black")],
                            by=list(ct$unique.officer),sum,na.rm=T)
ct.officer = aggregate(ct[,c("stopoccur","searchoccur","contraband")],
                       by=list(ct$unique.officer,ct$race.ethnicity),
                       sum,na.rm=T)
ct.officer.ids = ct.officer.temp$Group.1[ct.officer.temp$stop.white>49&
                                           ct.officer.temp$stop.black>49&
                                           ct.officer.temp$search.white>0&
                                           ct.officer.temp$search.black>0]
ct.officer$include = ifelse(ct.officer$Group.1%in%ct.officer.ids,1,0)
ct.officer.ids2 = ct.officer.temp$Group.1[ct.officer.temp$stop.white>49&
                                            ct.officer.temp$stop.black>49]
ct.officer$include2 = ifelse(ct.officer$Group.1%in%ct.officer.ids2,1,0)

save(ct.officer,file="Data/CT_Officers_50.RData")

##
## CT -- Fitting the Officer Model
##

rm(list=ls())

# If starting with the collapsed data sets, start here. 
# If starting from the rwa data, continue at this point
# after creating the collapsed data sets. 

# Load in the collapsed data sets. 
load("Data/CT_Officers_50.RData")

# Setting up the officer stan list and cropping the data set further
ct.officer.sm = subset(ct.officer, ct.officer$include==1)
table(ct.officer.sm$include,ct.officer.sm$include2)

ct.officer.sm$state.agency = ifelse(grepl("CSP ",ct.officer.sm$Group.1),1,0)
ct.officer.sm$state.agency = ifelse(grepl("DMV ",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("DMV",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("MTA",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("Yale",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("State Police",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("SU",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("UCONN",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("CAPITOL POLICE",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)
ct.officer.sm$state.agency = ifelse(grepl("Tribal",ct.officer.sm$Group.1),1,
                                    ct.officer.sm$state.agency)

ct.officer.sm = subset(ct.officer.sm,ct.officer.sm$state.agency == 0)
table(ct.officer.sm$state.agency)

ct.officer.sm = subset(ct.officer.sm, ct.officer.sm$Group.2=="White"|
                         ct.officer.sm$Group.2=="Black")

table(ct.officer.sm$include,ct.officer.sm$include2)

ct.officer.sm$Department = apply(as.matrix(as.character(ct.officer.sm$Group.1)),1,
                                 function(x){strsplit(x,"::",fixed=T)[[1]][1]})
table(ct.officer.sm$Department)

colnames(ct.officer.sm) = c("officer","driver_rg","stops","searches","contraband",
                            "include","include2","state.agency","Department")
table(ct.officer.sm$Department)

save(ct.officer.sm, file="Data/CT_OfficerSm_Data_50.RData")

# Officer set up for running Stan model. 
ct.officer.stan = list(N = nrow(ct.officer.sm),
                       R = length(unique(ct.officer.sm$driver_rg)),
                       D = length(unique(ct.officer.sm$officer)),
                       K = length(unique(ct.officer.sm$Department)),
                       r = as.integer(as.factor(ct.officer.sm$driver_rg)),
                       d = as.integer(as.factor(ct.officer.sm$officer)),
                       k = as.integer(as.factor(ct.officer.sm$Department)),
                       n = ct.officer.sm$stops,
                       s = ct.officer.sm$searches,
                       h = ct.officer.sm$contraband)

# The original specification is 5 chains, a 4000 warmup iteaations, 
# 12,000 iterations that are kept, and a delta of 0.95.
library(rstan)
rstan_options(auto_write = TRUE)

options(mc.cores = 5)
ct.model.multi = stan("Scripts/fasttt-master/stan_models/working_countrywide_mixture2.stan",
                      data = ct.officer.stan,
                      chains = 5,
                      cores = 5,
                      warmup = 5000,#Orig: 4000
                      iter = 25000,#Oirg: 12500
                      thin = 5,
                      seed = 98713,
                      control = list(adapt_delta = 0.9,#Orig: 0.95
                                     max_treedepth = 20))
save(ct.model.multi,file="Models/CT_FastTT_50.RData")

###############################################################################
###############################################################################
###############################################################################

##
## CT -- Analysis
##

# Clear the Space
rm(list=ls())

# Load in the Data
load("Data/CT_OfficerSm_Data_50.RData")
load("Models/CT_FastTT_50.RData")

# Traceplot for a random sample of thresholds. 
set.seed(987)
t_for_trace = sample(1:1894,8)
t_names_for_trace = paste("t_i[",t_for_trace,"]",sep="")

png("Figures/CT_Traceplot_50.png",1254,769)
traceplot(ct.model.multi,
          pars=t_names_for_trace,nrow=2)
dev.off()

png("Figures/CT_SampOffDensity_50.png",629,592)
plot(ct.model.multi, show_density=TRUE, 
     pars=t_names_for_trace, 
     ci_level=0.95, outer_level=0.999)
dev.off()

# Checking that it's only phi_d[1] and XXX that have
# missing values for R Hat. Checking that R Hat is 
# less than 1.01 for all estimates. 
ct.summary = summary(ct.model.multi)$summary

ct.summary[which(is.na(ct.summary[,10])),]
table(ct.summary[,10]<1)
table(ct.summary[,10]<1.01)

# Cropping the model output to what is needed.
ct.summary.ti = ct.summary[2:1895,]
ct.output = data.frame("fasttt.mean" = ct.summary.ti[,1],
                       "fasttt.lower" = ct.summary.ti[,1]-
                         1.96*ct.summary.ti[,2],
                       "fasttt.upper" = ct.summary.ti[,1]-
                         1.96*ct.summary.ti[,2])

# Merging in that information. 
ct.officer.analysis = cbind(ct.officer.sm,ct.output)

# Transforming the data set from long to wide. 
ct.officer.sm.temp = ct.officer.analysis[,c(1:5,10:12)]
ct.officer.wide = tidyr::pivot_wider(data=ct.officer.sm.temp,
                                     id_cols="officer",
                                     names_from="driver_rg",
                                     values_from = c("stops","searches","contraband",
                                                     "fasttt.mean","fasttt.lower",
                                                     "fasttt.upper"))

# Calculating additional values of interest.
ct.officer.wide$search_rate_white = ct.officer.wide$searches_White/
  ct.officer.wide$stops_White
ct.officer.wide$search_rate_black = ct.officer.wide$searches_Black/
  ct.officer.wide$stops_Black
ct.officer.wide$hit_rate_white = ifelse(ct.officer.wide$searches_White>0&
                                          ct.officer.wide$searches_Black>0,
                                        ct.officer.wide$contraband_White/
                                          ct.officer.wide$searches_White,NA)
ct.officer.wide$hit_rate_black = ifelse(ct.officer.wide$searches_Black>0&
                                          ct.officer.wide$searches_White>0,
                                        ct.officer.wide$contraband_Black/
                                          ct.officer.wide$searches_Black,NA)
ct.officer.wide$dif.tt = ct.officer.wide$fasttt.mean_White-
  ct.officer.wide$fasttt.mean_Black
ct.officer.wide$dif.tt.cat = ifelse(ct.officer.wide$dif.tt>0,
                                    ifelse(ct.officer.wide$fasttt.lower_White>
                                             ct.officer.wide$fasttt.upper_Black,
                                           "3 Black Drivers Disp. Neg.",
                                           "2 Approx. Equal"),
                                    ifelse(ct.officer.wide$fasttt.lower_Black>
                                             ct.officer.wide$fasttt.upper_White,
                                           "1 White Drivers Disp. Neg.",
                                           "2 Approx. Equal"))
ct.officer.wide$dif.sr = ct.officer.wide$search_rate_white-
  ct.officer.wide$search_rate_black
ct.officer.wide$dif.sr.cat = ifelse(ct.officer.wide$dif.sr==0,
                                    "2 Apprrox. Equal",
                                    ifelse(ct.officer.wide$dif.sr>0,
                                           "1 White Drivers Disp. Neg.",
                                           "3 Black Drivers Disp. Neg."))
ct.officer.wide$dif.hr = ct.officer.wide$hit_rate_white-
  ct.officer.wide$hit_rate_black
ct.officer.wide$dif.hr.cat = ifelse(ct.officer.wide$dif.hr==0,
                                    "2 Approx. Equal",
                                    ifelse(ct.officer.wide$dif.hr<0,
                                           "1 White Drivers Disp. Neg.",
                                           "3 Black Drivers Disp. Neg."))
save(ct.officer.wide,file="Data/CT_WideOfficerData.RData")

dim(ct.officer.wide)
length(unique(apply(as.matrix(as.character(ct.officer.wide$officer)),1,function(x){strsplit(x,"::",fixed=T)[[1]][1]})))
sum(ct.officer.wide$stops_Black)+sum(ct.officer.wide$stops_White)
sum(ct.officer.wide$searches_Black)+sum(ct.officer.wide$searches_White)

fig1b = ggplot(ct.officer.wide,aes(fasttt.mean_White,fasttt.mean_Black)) + 
  geom_point(size=1.5) + theme_bw(base_size=15) +
  xlab(NULL) + ylab(NULL) +
  geom_abline(intercept = 0, slope = 1,color = "darkgray",size = 0.5) +
  geom_abline(intercept = 0, slope = 2,color = "darkgray",linetype="dashed",size = 0.5) +
  geom_abline(intercept = 0, slope = 0.5,color = "darkgray",linetype="dashed",size = 0.5) +
  xlim(0,5)+ylim(0,5)+
  theme(panel.grid.major = element_line(colour = NA),
        panel.grid.minor = element_line(colour=NA),
        legend.position = "none")
ggsave(plot=fig1b,filename = "Figures/Figure1b.eps")
png("Figures/CT_TT_Officers.png",547,441)
fig1b
dev.off()

fig1a = ggplot(ct.officer.wide,aes(search_rate_white,search_rate_black)) + 
  geom_point(size=1.5) + theme_bw(base_size=15) +
  xlab(NULL) + ylab(NULL) +
  geom_abline(intercept = 0, slope = 1,color = "darkgray",size = 0.5) +
  geom_abline(intercept = 0, slope = 2,color = "darkgray",linetype="dashed",size = 0.5) +
  geom_abline(intercept = 0, slope = 0.5,color = "darkgray",linetype="dashed",size = 0.5) +
  xlim(0,1)+ylim(0,1)+
  theme(panel.grid.major = element_line(colour = NA),
        panel.grid.minor = element_line(colour=NA),
        legend.position = "none")
ggsave(plot=fig1a,filename = "Figures/Figure1a.eps",5.93, 5.79)
png("Figures/CT_SR_Officers.png",547,441)
fig1a
dev.off()

srr.ct = ct.officer.wide$search_rate_black/ct.officer.wide$search_rate_white

prop.table(table(srr.ct>=2 | srr.ct <=0.5))
